# Park visit data: one row per park-unit/year with visitor counts plus
# identifying columns (unit_code, unit_name, region, state, geometry, ...)
park_visits <- readr::read_csv(here::here("data/park-visits.csv"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## year = col_character(),
## gnis_id = col_character(),
## geometry = col_character(),
## metadata = col_character(),
## number_of_records = col_double(),
## parkname = col_character(),
## region = col_character(),
## state = col_character(),
## unit_code = col_character(),
## unit_name = col_character(),
## unit_type = col_character(),
## visitors = col_double()
## )
# Yearly gas prices: gas_current and gas_constant columns
# (gas_constant is presumably inflation-adjusted — TODO confirm with source)
gas_price <- readr::read_csv(here::here("data/gas-price.csv"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## year = col_double(),
## gas_current = col_double(),
## gas_constant = col_double()
## )
# Yearly population per US state: year, state, pop
state_pop <- readr::read_csv(here::here("data/state-pop.csv"))
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## year = col_double(),
## state = col_character(),
## pop = col_double()
## )
Initial cleaning
# Visual overview of column types and missingness across the whole table
visdat::vis_dat(park_visits)
# Per-variable missingness counts and percentages
naniar::miss_var_summary(park_visits)
## # A tibble: 12 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 metadata 2712 12.6
## 2 parkname 2218 10.3
## 3 visitors 4 0.0186
## 4 year 0 0
## 5 gnis_id 0 0
## 6 geometry 0 0
## 7 number_of_records 0 0
## 8 region 0 0
## 9 state 0 0
## 10 unit_code 0 0
## 11 unit_name 0 0
## 12 unit_type 0 0
# Base-R summary of every column (character columns show length/class only)
summary(park_visits)
## year gnis_id geometry metadata
## Length:21560 Length:21560 Length:21560 Length:21560
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## number_of_records parkname region state
## Min. :1 Length:21560 Length:21560 Length:21560
## 1st Qu.:1 Class :character Class :character Class :character
## Median :1 Mode :character Mode :character Mode :character
## Mean :1
## 3rd Qu.:1
## Max. :1
##
## unit_code unit_name unit_type visitors
## Length:21560 Length:21560 Length:21560 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 39125
## Mode :character Mode :character Mode :character Median : 155219
## Mean : 1277105
## 3rd Qu.: 608144
## Max. :871922828
## NA's :4
# are columns completely unique for their length?
vec_size(park_visits)
## [1] 21560
# NOTE(review): despite its name, this returns TRUE when a column holds
# exactly ONE distinct value — i.e. a CONSTANT column — not when every
# value is unique. A truly-unique test would be
# vec_unique_count(x) == vec_size(x). The narrative below relies on the
# current behaviour (it flags the constant number_of_records column),
# so the logic is deliberately left unchanged.
completely_unique <- function(x) vec_unique_count(x) == 1
# Proportion of distinct values relative to row count (1 = fully unique)
prop_unique <- function(x) (vec_unique_count(x) / vec_size(x))
# is everything totally unique?
# note: completely_unique() actually flags columns with a single distinct
# value, so TRUE below means "constant column", not "all values unique"
map_lgl(park_visits, completely_unique)
## year gnis_id geometry metadata
## FALSE FALSE FALSE FALSE
## number_of_records parkname region state
## TRUE FALSE FALSE FALSE
## unit_code unit_name unit_type visitors
## FALSE FALSE FALSE FALSE
# is any column constant at all?
map_lgl(park_visits, completely_unique) %>% any()
## [1] TRUE
# which column is it? (number_of_records, the 5th column)
map_lgl(park_visits, completely_unique) %>% which()
## number_of_records
## 5
# How unique is each column? Compute the proportion of distinct values per
# column, reshape to long form, and sort from most to least unique.
park_visits %>%
  map_dfr(prop_unique) %>%
  pivot_longer(cols = everything()) %>%
  arrange(desc(value))
## # A tibble: 12 x 2
## name value
## <chr> <dbl>
## 1 visitors 0.877
## 2 unit_name 0.0179
## 3 gnis_id 0.0175
## 4 unit_code 0.0175
## 5 parkname 0.0156
## 6 metadata 0.0153
## 7 year 0.00529
## 8 state 0.00250
## 9 unit_type 0.00139
## 10 region 0.000371
## 11 geometry 0.0000928
## 12 number_of_records 0.0000464
Let’s count the number of visitors
# Tabulate how often each distinct visitor total appears
count(park_visits, visitors)
## # A tibble: 18,916 x 2
## visitors n
## <dbl> <int>
## 1 0 180
## 2 5 2
## 3 7 2
## 4 10 2
## 5 12 2
## 6 14 2
## 7 15 2
## 8 17 2
## 9 19 4
## 10 20 1
## # … with 18,906 more rows
And how many different patterns of counts do we have? We can count the counts:
# Frequency-of-frequencies: how many visitor totals occur once, twice, etc.
park_visits %>%
  count(visitors) %>%
  count(n)
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.
## # A tibble: 16 x 2
## n nn
## <int> <int>
## 1 1 17419
## 2 2 1058
## 3 3 209
## 4 4 112
## 5 5 53
## 6 6 22
## 7 7 19
## 8 8 7
## 9 9 5
## 10 10 2
## 11 11 3
## 12 13 1
## 13 14 3
## 14 15 1
## 15 16 1
## 16 180 1
Lots of single visits, apparently?
What is in number of records?
# number_of_records has a single distinct value, so it is uninformative
n_distinct(park_visits$number_of_records)
## [1] 1
park_visits_tidy <- park_visits %>%
  # year arrives as character (some rows hold "Total"); coerce to numeric,
  # letting non-numeric rows become NA with a parsing warning
  mutate(year = parse_number(year)) %>%
  # place the visitor count right after year for easier reading
  relocate(visitors, .after = year) %>%
  # number_of_records is constant (always 1), so it carries no information
  select(-number_of_records)
## Warning: Problem with `mutate()` input `year`.
## ℹ 386 parsing failures.
## row col expected actual
## 301 -- a number Total
## 309 -- a number Total
## 338 -- a number Total
## 370 -- a number Total
## 387 -- a number Total
## ... ... ........ ......
## See problems(...) for more details.
##
## ℹ Input `year` is `parse_number(year)`.
## Warning: 386 parsing failures.
## row col expected actual
## 301 -- a number Total
## 309 -- a number Total
## 338 -- a number Total
## 370 -- a number Total
## 387 -- a number Total
## ... ... ........ ......
## See problems(...) for more details.
# Inspect the cleaned table
park_visits_tidy
## # A tibble: 21,560 x 11
## year visitors gnis_id geometry metadata parkname region state unit_code
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1904 1500 1163670 POLYGON <NA> Crater … PW OR CRLA
## 2 1941 0 1531834 MULTIPO… <NA> Lake Ro… PW WA LARO
## 3 1961 69000 2055170 MULTIPO… <NA> Lewis a… PW WA LEWI
## 4 1935 2200 1530459 MULTIPO… <NA> Olympic PW WA OLYM
## 5 1982 468144 277263 POLYGON <NA> Santa M… PW CA SAMO
## 6 1919 64000 578853 MULTIPO… <NA> <NA> NE ME ACAD
## 7 1969 448000 1329499 MULTIPO… <NA> <NA> IM TX AMIS
## 8 1967 738700 589056 POLYGON <NA> <NA> NE MD ASIS
## 9 1944 1409 1377082 POLYGON <NA> <NA> IM TX BIBE
## 10 1989 81157 302659 POLYGON <NA> <NA> SE FL BICY
## # … with 21,550 more rows, and 2 more variables: unit_name <chr>,
## # unit_type <chr>
Let’s define this as a time series tsibble. However, to do so, we’ll need to identify the index and key
What defines a park?
parkname: character — full park name; unit_code: character — park code abbreviation; unit_name: character — park unit name; unit_type: character — park unit type. Are they the same length?
# How many distinct values does each candidate key column have?
# summarise_at() is superseded in dplyr >= 1.0 (which this session uses);
# across() is the current idiom and produces identical output here.
park_visits_tidy %>%
  summarise(across(c(parkname, unit_code, unit_name), n_distinct))
## # A tibble: 1 x 3
## parkname unit_code unit_name
## <int> <int> <int>
## 1 337 378 386
Ok let’s go with unit_name to keep things simple, especially since parkname has a bunch of missings. I think technically a park could be in multiple states so we won’t add hierarchy here. We’ll also remove the geometry column since it creates issues with tsibble.
# How much memory does each column use? (geometry is expected to dominate)
inspectdf::inspect_mem(park_visits_tidy) %>% inspectdf::show_plot()
So the visitor count is the thing that I’m most interested in, I think.
# Scatter visitor counts over time to eyeball the overall pattern
ggplot(park_visits_tidy) +
  geom_point(aes(x = year, y = visitors))
## Warning: Removed 386 rows containing missing values (geom_point).
outliers?
# Boxplot of visitor counts to highlight the extreme outliers
ggplot(park_visits_tidy) +
  geom_boxplot(aes(x = visitors))
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
Where do these occur? Let’s make the data a little bit smaller
# Keep only the columns needed for the outlier investigation
park_visits_cut <- park_visits_tidy %>%
  select(year, visitors, region:unit_type)
# Which rows carry the enormous visitor counts?
park_visits_cut %>%
  filter(visitors > 2.5e8)
## # A tibble: 8 x 7
## year visitors region state unit_code unit_name unit_type
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 NA 871922828 NT NC BLRI Blue Ridge Parkway Parkway
## 2 NA 521947058 SE NC GRSM Great Smoky Mountains… National Park
## 3 NA 330201962 NC VA GWMP George Washington Mem… National Parkway
## 4 NA 411700377 PW NV LAKE Lake Mead National Re… National Recrea…
## 5 NA 611031225 PW CA GOGA Golden Gate National … National Recrea…
## 6 NA 329664174 NE NY GATE Gateway National Recr… National Recrea…
## 7 NA 443145232 SE MS NATR Natchez Trace Parkway Parkway
## 8 NA 282420671 NE VA COLO Colonial National His… National Histor…
Do we get large numbers of visitors when there are missing values?
library(naniar)
# Missingness summary of the trimmed table
miss_var_summary(park_visits_cut)
## # A tibble: 7 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 year 386 1.79
## 2 visitors 4 0.0186
## 3 region 0 0
## 4 state 0 0
## 5 unit_code 0 0
## 6 unit_name 0 0
## 7 unit_type 0 0
# Upset plot: which combinations of variables are missing together?
gg_miss_upset(park_visits_cut)
# Nabular form appends a shadow column (<var>_NA) for each variable
park_visits_cut_nab <- nabular(park_visits_cut)
# Does the maximum visitor count differ between rows with vs without a year?
park_visits_cut_nab %>%
group_by(year_NA) %>%
summarise(max_visitors = max(visitors, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## year_NA max_visitors
## <fct> <dbl>
## 1 !NA 21767176
## 2 NA 871922828
# Compare visitor distributions for rows with vs without a missing year
ggplot(park_visits_cut_nab,
aes(x = year_NA,
y = visitors)) +
geom_boxplot()
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
OK so based on that I think I should remove the rows that contain missing values for year and visitors.
# Drop the unusable rows: missing year (the "Total" summary rows) and
# missing visitors. Naming the columns explicitly matches the stated intent
# (only year and visitors contain NAs here) and protects against silently
# losing rows if other columns ever gain missing values upstream.
park_visits_cut_na <- park_visits_cut %>%
  drop_na(year, visitors)
OK now let’s add on data for the state population, and the gas price, to make things a bit more interesting. Let’s quickly join them in.
# Join on yearly gas prices and state populations, then declare a tsibble
# keyed by unit_name with year as the (annual) index
park_visits_ts <- park_visits_cut_na %>%
left_join(gas_price, by = "year") %>%
left_join(state_pop, by = c("year", "state")) %>%
as_tsibble(key = unit_name,
index = year)
park_visits_ts
## # A tsibble: 21,174 x 10 [1Y]
## # Key: unit_name [382]
## year visitors region state unit_code unit_name unit_type gas_current
## <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 1934 175000 SE KY ABLI Abraham … National… 0.19
## 2 1935 89355 SE KY ABLI Abraham … National… 0.19
## 3 1936 149665 SE KY ABLI Abraham … National… 0.19
## 4 1937 111840 SE KY ABLI Abraham … National… 0.2
## 5 1938 121144 SE KY ABLI Abraham … National… 0.2
## 6 1939 112626 SE KY ABLI Abraham … National… 0.19
## 7 1940 136945 SE KY ABLI Abraham … National… 0.18
## 8 1941 110830 SE KY ABLI Abraham … National… 0.19
## 9 1942 59210 SE KY ABLI Abraham … National… 0.2
## 10 1943 13520 SE KY ABLI Abraham … National… 0.21
## # … with 21,164 more rows, and 2 more variables: gas_constant <dbl>, pop <dbl>
OK, so now we get some nice spaghetti.
# Spaghetti plot: one visitors-over-time line per park unit
gg_park_visits_spag <- ggplot(park_visits_ts,
aes(x = year,
y = visitors,
group = unit_name)) +
geom_line()
Let’s use brolgar to help inspect that
library(brolgar)
# Break the spaghetti into strata ordered along population, then gas price
gg_park_visits_spag + facet_strata(along = pop)
gg_park_visits_spag + facet_strata(along = gas_constant)
Let’s use keys_near to identify those parks near the five number summary:
# Which parks sit closest to the five-number summary of visitor counts?
park_visits_ts %>%
keys_near(visitors)
## # A tibble: 185 x 5
## unit_name visitors stat stat_value stat_diff
## <chr> <dbl> <fct> <dbl> <dbl>
## 1 Aniakchak National Monument 0 min 0 0
## 2 Aniakchak National Preserve 0 min 0 0
## 3 Appomattox Court House National Historic… 148300 med 148300 0
## 4 Big Hole National Battlefield 0 min 0 0
## 5 Big Hole National Battlefield 0 min 0 0
## 6 Brices Cross Roads National Battlefield … 0 min 0 0
## 7 Brices Cross Roads National Battlefield … 0 min 0 0
## 8 Brices Cross Roads National Battlefield … 0 min 0 0
## 9 Brices Cross Roads National Battlefield … 0 min 0 0
## 10 Brices Cross Roads National Battlefield … 0 min 0 0
## # … with 175 more rows
huh, filter out the zeros?
# Ties at zero dominated the previous result; repeat with zero-visitor
# rows removed to get one park per summary statistic
park_visits_ts %>%
filter(visitors > 0) %>%
keys_near(visitors)
## # A tibble: 10 x 5
## unit_name visitors stat stat_value stat_diff
## <chr> <dbl> <fct> <dbl> <dbl>
## 1 Appomattox Court House National Historic… 152200 med 152207 7
## 2 Arkansas Post National Memorial 39703 q_25 39703. 0.25
## 3 Bryce Canyon National Park 579200 q_75 579200 0
## 4 Denali National Park 5 min 5 0
## 5 Denali National Preserve 5 min 5 0
## 6 Everglades National Park 579200 q_75 579200 0
## 7 Golden Gate National Recreation Area 21767176 max 21767176 0
## 8 Herbert Hoover National Historic Site 152214 med 152207 7
## 9 Lincoln Boyhood National Memorial 152200 med 152207 7
## 10 Lyndon B. Johnson National Historical Pa… 579200 q_75 579200 0
# Re-attach the full yearly series for each "near-summary" park for plotting
park_visits_ts_near <- park_visits_ts %>%
filter(visitors > 0) %>%
keys_near(visitors) %>%
# drop the single summary value so the join brings in the yearly visitors
select(-visitors) %>%
left_join(park_visits_ts, by = "unit_name")
park_visits_ts_near
## # A tibble: 664 x 13
## unit_name stat stat_value stat_diff year visitors region state unit_code
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
## 1 Appomatt… med 152207 7 1941 50000 NE VA APCO
## 2 Appomatt… med 152207 7 1942 9750 NE VA APCO
## 3 Appomatt… med 152207 7 1943 3125 NE VA APCO
## 4 Appomatt… med 152207 7 1944 5450 NE VA APCO
## 5 Appomatt… med 152207 7 1945 7850 NE VA APCO
## 6 Appomatt… med 152207 7 1946 19400 NE VA APCO
## 7 Appomatt… med 152207 7 1947 25250 NE VA APCO
## 8 Appomatt… med 152207 7 1948 27750 NE VA APCO
## 9 Appomatt… med 152207 7 1949 34150 NE VA APCO
## 10 Appomatt… med 152207 7 1950 55400 NE VA APCO
## # … with 654 more rows, and 4 more variables: unit_type <chr>,
## # gas_current <dbl>, gas_constant <dbl>, pop <dbl>
plot these
library(stickylabeller)
# One panel per (statistic, park) pair; free y scales since magnitudes vary
ggplot(park_visits_ts_near,
aes(x = year,
y = visitors,
colour = stat)) +
geom_line() +
facet_wrap(~stat + unit_name,
scales = "free_y",
labeller = label_glue("Park: {unit_name} \nNearest to {stat}")) +
theme(legend.position = "none")
Now plot them by the slope of year, another way of looking at it
# Fit visitors ~ year per park, then find the parks whose year-slope is
# nearest each summary statistic, and re-attach their full series
park_visits_ts_lm_near <- park_visits_ts %>%
filter(visitors > 0) %>%
key_slope(visitors ~ year) %>%
keys_near(key = unit_name,
var = .slope_year,
top_n = 6) %>%
left_join(park_visits_ts, by = "unit_name")
library(stickylabeller)
# Each near-slope park's series with its fitted linear trend overlaid
ggplot(park_visits_ts_lm_near,
aes(x = year,
y = visitors,
colour = stat)) +
geom_smooth(method = "lm", se = FALSE,
colour = "grey50",
alpha = 0.5,
size = 0.5) +
geom_line() +
facet_wrap(~stat + unit_name,
scales = "free_y",
labeller = label_glue("Park: {unit_name} \nNearest to {stat} slope"),
nrow = 10) +
theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'
Now can we add in the other variables, population size and gas to see if there’s something interesting there?
# Extend the per-park model with population and gas price covariates
# (rows without population data are dropped first so the model can fit)
park_visits_lm_gas_pop <- park_visits_ts %>%
drop_na(pop) %>%
filter(visitors > 0) %>%
key_slope(visitors ~ year + pop + gas_constant)
# Parks nearest the summary statistics of the POPULATION slope this time
park_visits_ts_lm_pop_near <- park_visits_lm_gas_pop %>%
keys_near(key = unit_name,
var = .slope_pop,
top_n = 6) %>%
left_join(park_visits_ts, by = "unit_name")
library(stickylabeller)
# BUG FIX: this plot previously reused `park_visits_ts_lm_near` (the
# year-slope result), duplicating the earlier figure instead of showing the
# population-slope parks computed just above. Plot the pop-slope data.
ggplot(park_visits_ts_lm_pop_near,
       aes(x = year,
           y = visitors,
           colour = stat)) +
  geom_smooth(method = "lm", se = FALSE,
              colour = "grey50",
              alpha = 0.5,
              size = 0.5) +
  geom_line() +
  facet_wrap(~stat + unit_name,
             scales = "free_y",
             labeller = label_glue("Park: {unit_name} \nNearest to {stat} slope"),
             nrow = 10) +
  theme(legend.position = "none")
## `geom_smooth()` using formula 'y ~ x'
Are some of them always increasing?
# Sanity-check the gas price series (range of years and prices)
summary(gas_price)
## year gas_current gas_constant
## Min. :1929 Min. :0.1700 Min. :1.470
## 1st Qu.:1950 1st Qu.:0.2700 1st Qu.:1.830
## Median :1972 Median :0.3600 Median :2.040
## Mean :1972 Mean :0.9001 Mean :2.202
## 3rd Qu.:1994 3rd Qu.:1.1650 3rd Qu.:2.470
## Max. :2015 Max. :3.6400 Max. :3.800
# Constant-dollar gas price over time
ggplot(gas_price,
aes(x = year,
y = gas_constant)) +
geom_line()
# And a quick look at the state population table
state_pop
## # A tibble: 5,916 x 3
## year state pop
## <dbl> <chr> <dbl>
## 1 1900 AL 1830000
## 2 1901 AL 1907000
## 3 1902 AL 1935000
## 4 1903 AL 1957000
## 5 1904 AL 1978000
## 6 1905 AL 2012000
## 7 1906 AL 2045000
## 8 1907 AL 2058000
## 9 1908 AL 2070000
## 10 1909 AL 2108000
## # … with 5,906 more rows
## datetime
# Record when this document was rendered
Sys.time()
## [1] "2020-10-19 10:52:58 AEDT"
## repository
# Report repository state: use git2r if installed, otherwise shell out to git
if(requireNamespace('git2r', quietly = TRUE)) {
git2r::repository()
} else {
c(
system2("git", args = c("log", "--name-status", "-1"), stdout = TRUE),
system2("git", args = c("remote", "-v"), stdout = TRUE)
)
}
## Local: /Users/ntie0001/github/njtierney/tidy-tue-npv
## Head: nothing commited (yet)
## session info
# Session info for reproducibility
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stickylabeller_0.0.0.9100 brolgar_0.0.6.9100
## [3] naniar_0.6.0.9000 tsibble_0.9.3.9000
## [5] vctrs_0.3.4 forcats_0.5.0
## [7] stringr_1.4.0 dplyr_1.0.2
## [9] purrr_0.3.4 readr_1.4.0
## [11] tidyr_1.1.2 tibble_3.0.4
## [13] ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] httr_1.4.2 splines_4.0.2 jsonlite_1.7.1
## [4] here_0.1 modelr_0.1.8 assertthat_0.2.1
## [7] distributional_0.2.0 askpass_1.1 conflicted_1.0.4
## [10] blob_1.2.1 fabletools_0.2.1 cellranger_1.1.0
## [13] yaml_2.2.1 progress_1.2.2 lattice_0.20-41
## [16] pillar_1.4.6 backports_1.1.10 glue_1.4.2
## [19] visdat_0.5.3 digest_0.6.26 rvest_0.3.6
## [22] colorspace_1.4-1 Matrix_1.2-18 htmltools_0.5.0
## [25] plyr_1.8.6 pkgconfig_2.0.3 broom_0.7.0
## [28] haven_2.3.1 scales_1.1.1 git2r_0.27.1
## [31] openssl_1.4.3 mgcv_1.8-33 generics_0.0.2
## [34] farver_2.0.3 ellipsis_0.3.1 UpSetR_1.4.0
## [37] withr_2.3.0 inspectdf_0.0.9 credentials_1.3.0
## [40] cli_2.1.0 magrittr_1.5 crayon_1.3.4
## [43] readxl_1.3.1 memoise_1.1.0 evaluate_0.14
## [46] fs_1.5.0 fansi_0.4.1 nlme_3.1-149
## [49] anytime_0.3.9 xml2_1.3.2 tools_4.0.2
## [52] prettyunits_1.1.1 hms_0.5.3 lifecycle_0.2.0
## [55] munsell_0.5.0 reprex_0.3.0 compiler_4.0.2
## [58] rlang_0.4.8 grid_4.0.2 rstudioapi_0.11
## [61] sys_3.4 labeling_0.3 rmarkdown_2.3.8
## [64] gtable_0.3.0 DBI_1.1.0 R6_2.4.1
## [67] gridExtra_2.3 lubridate_1.7.9 knitr_1.30
## [70] utf8_1.1.4 rprojroot_1.3-2 stringi_1.5.3
## [73] Rcpp_1.0.5 ggfittext_0.9.0 dbplyr_1.4.4
## [76] tidyselect_1.1.0 xfun_0.18